The purpose of this part is to visualize and assess all of the data in preparation for model fitting. It also fulfills the first part of the final project requirements.
library(tidyverse)
library(visdat)
# Load the training data; the first row of the CSV supplies the column names.
df_all <- readr::read_csv(file = "final_project_train.csv", col_names = TRUE)
# Print a compact per-column overview (type and leading values).
glimpse(df_all)
## Rows: 677
## Columns: 38
## $ rowid <dbl> 1, 3, 4, 5, 8, 9, 11, 14, 15, 16, 17, 18, 19, 22, 24, 25, 27,…
## $ region <chr> "XX", "XX", "XX", "XX", "XX", "XX", "XX", "XX", "XX", "XX", "…
## $ customer <chr> "B", "B", "B", "B", "B", "B", "B", "B", "B", "B", "B", "B", "…
## $ xb_01 <dbl> 4.000000, 1.000000, 2.000000, 2.520000, 2.548387, 3.071429, 3…
## $ xb_02 <dbl> 4, 1, 2, 11, 6, 6, 10, 12, 9, 10, 8, 10, 10, 8, 6, 10, 13, 10…
## $ xb_03 <dbl> 4, 1, 2, -6, -1, 1, -4, -4, -2, -4, -2, -2, -2, -4, 1, -4, -3…
## $ xn_01 <dbl> 3.0000000, 2.0000000, 2.0000000, 1.5333333, 0.8387097, 1.8571…
## $ xn_02 <dbl> 3, 2, 4, 9, 3, 8, 6, 10, 10, 4, 6, 8, 9, 5, 7, 12, 12, 6, 6, …
## $ xn_03 <dbl> 3, 2, 0, -3, -4, -2, -5, -6, -3, -5, -3, -6, -4, -3, 0, -5, -…
## $ xa_01 <dbl> 12.000000, 3.000000, 9.000000, 7.080000, 6.451613, 6.857143, …
## $ xa_02 <dbl> 12, 3, 9, 29, 17, 18, 24, 27, 20, 19, 15, 24, 24, 15, 14, 26,…
## $ xa_03 <dbl> 12, 3, 9, -7, -2, 2, -9, -5, -3, -3, -1, 1, -2, -3, 3, -4, -5…
## $ xb_04 <dbl> 1.3333333, 1.0000000, 1.0000000, 0.8950476, 1.2247312, 1.1857…
## $ xb_05 <dbl> 1.3333333, 1.0000000, 1.0000000, -2.0000000, -0.5000000, 0.00…
## $ xb_06 <dbl> 1.333333, 1.000000, 1.000000, 4.000000, 4.000000, 3.000000, 6…
## $ xb_07 <dbl> 4.000000, 1.000000, 2.000000, 1.933333, 1.967742, 1.714286, 1…
## $ xb_08 <dbl> -1.00000000, 1.00000000, 0.00000000, -0.08000000, 0.35483871,…
## $ xn_04 <dbl> 1.0000000, 2.0000000, 1.0000000, 0.5268889, 0.4688172, 0.5607…
## $ xn_05 <dbl> 1.0000000, 2.0000000, 0.0000000, -1.0000000, -1.3333333, -1.0…
## $ xn_06 <dbl> 1.0, 2.0, 2.0, 2.5, 3.0, 2.0, 4.0, 4.0, 3.0, 2.0, 2.0, 2.5, 2…
## $ xn_07 <dbl> 3.000000, 2.000000, 2.500000, 1.493333, 1.225806, 1.642857, 1…
## $ xn_08 <dbl> -1.0000000, 2.0000000, -1.0000000, -0.4400000, -0.4516129, -0…
## $ xa_04 <dbl> 6.000000, 3.000000, 6.750000, 2.425333, 3.023656, 2.685714, 2…
## $ xa_05 <dbl> 6.0000000, 3.0000000, 4.5000000, -3.5000000, -0.6666667, 0.40…
## $ xa_06 <dbl> 6.000000, 3.000000, 9.000000, 9.000000, 13.000000, 6.000000, …
## $ xa_07 <dbl> 9.000000, 3.000000, 7.500000, 4.466667, 4.612903, 4.071429, 4…
## $ xa_08 <dbl> 3.0000000, 3.0000000, 6.0000000, 0.7066667, 1.3225806, 1.3571…
## $ xw_01 <dbl> 23.00000, 17.00000, 52.50000, 64.52564, 54.75758, 58.33333, 6…
## $ xw_02 <dbl> 23, 17, 48, 0, 12, 15, 0, 0, 0, 7, 14, 0, 0, 0, 8, 8, 0, 4, 2…
## $ xw_03 <dbl> 23, 17, 57, 106, 105, 101, 107, 109, 109, 104, 109, 99, 103, …
## $ xs_01 <dbl> 0.262073307, 0.330804757, 0.239795763, 0.142106837, 0.2442957…
## $ xs_02 <dbl> 0.26207331, 0.33080476, 0.19049123, -0.73321509, -0.12204299,…
## $ xs_03 <dbl> 0.2620733, 0.3308048, 0.2891003, 0.5500723, 1.3134719, 0.6540…
## $ xs_04 <dbl> 0.5375576, 0.4286607, 0.3676937, 0.2865445, 0.2375470, 0.2594…
## $ xs_05 <dbl> 0.5375575604, 0.4286607050, 0.2485001680, 0.0000000000, 0.043…
## $ xs_06 <dbl> 0.5375576, 0.4286607, 0.4868872, 0.6357541, 0.4327004, 0.8672…
## $ response <dbl> 2.617991, 1.184632, 2.216626, 2.726715, 1.483323, 2.039279, 1…
## $ outcome <chr> "non_event", "non_event", "event", "non_event", "non_event", …
No missing data
# Plot the missingness pattern of every column.
df_all %>% visdat::vis_miss()
# Number of distinct values in each column.
purrr::map_dbl(.x = df_all, .f = n_distinct)
## rowid region customer xb_01 xb_02 xb_03 xn_01 xn_02
## 677 3 9 229 19 21 225 18
## xn_03 xa_01 xa_02 xa_03 xb_04 xb_05 xb_06 xb_07
## 18 257 38 35 364 59 51 181
## xb_08 xn_04 xn_05 xn_06 xn_07 xn_08 xa_04 xa_05
## 187 360 51 47 174 174 411 87
## xa_06 xa_07 xa_08 xw_01 xw_02 xw_03 xs_01 xs_02
## 87 213 212 396 102 103 676 644
## xs_03 xs_04 xs_05 xs_06 response outcome
## 672 676 663 676 677 2
Q: Counts for categorical variables.
A: From the figure, the outcome classes look very imbalanced.
# Bar chart of the number of rows in each outcome class.
ggplot(data = count(df_all, outcome), mapping = aes(x = outcome, y = n)) +
  geom_col()
Q: Distributions for continuous variables. Are the distributions Gaussian like?
A: Most of them look Gaussian-like.
# Keep only the input columns, i.e. every column whose name starts with "x".
df_con_all <- select(df_all, starts_with("x"))

# Density of each input, one facet per variable.
# A row id is added so the frame can be pivoted to long format around it.
# (The former `select(all_of(colnames(df_con_all)))` step re-selected every
# column of the same frame and was a no-op, so it has been dropped.)
df_con_all %>%
  tibble::rowid_to_column() %>%
  pivot_longer(!c("rowid")) %>%
  ggplot() +
  geom_density(mapping = aes(x = value), adjust = 1.35, size = 0.5) +
  # Both axes are freed because the inputs span very different ranges.
  facet_wrap(~name, scales = "free") +
  theme_bw() +
  theme(axis.text.y = element_blank(), axis.text.x.bottom = element_blank())
# Histogram of the log-transformed response, with a rug marking each
# observation along the x-axis.
ggplot(data = df_all, mapping = aes(x = log(response))) +
  geom_histogram(bins = 35) +
  geom_rug(alpha = 0.2) +
  theme_bw()
Q: Are there differences in continuous variable distributions and continuous variable summary statistics based on region or customer?
A: Yes, if we zoom in, we do observe some differences, but they are not very significant.
# Long format: one row per (observation, input variable). The row id and the
# two outputs are dropped; region and customer are kept as identifier columns.
df_trans_02 <- df_all %>%
  select(-c(rowid, outcome, response)) %>%
  pivot_longer(cols = -c(region, customer))
# Density of each input, colored by region, one facet per variable.
# Fix: x must be the measured `value`; the original mapped x to `name` (the
# variable label), so the curves did not show the data distribution at all.
df_trans_02 %>%
  ggplot(mapping = aes(x = value, color = as.factor(region))) +
  geom_density() +
  # Both axes are freed: with real values on x, each input has its own range.
  facet_wrap(~name, scales = "free") +
  theme_bw() +
  theme(axis.text.y = element_blank(), axis.text.x.bottom = element_blank())
Again, this figure shows that the medians of the continuous variables differ between regions.
# Horizontal box plots of each input split by region, one facet per variable.
# Fill and outline share the same viridis palette and the legend title "Region".
ggplot(data = df_trans_02, mapping = aes(y = as.factor(name), x = value)) +
  geom_boxplot(
    mapping = aes(fill = as.factor(region), color = as.factor(region)),
    size = 0.1,
    alpha = 0.35
  ) +
  facet_wrap(~name, scales = "free") +
  scale_fill_viridis_d("Region") +
  scale_color_viridis_d("Region") +
  theme_bw() +
  theme(axis.text.x.bottom = element_blank(), axis.text.y = element_blank())
Q: Are there differences in continuous variable distributions and continuous variable summary statistics based on the binary outcome?
A: Yes, if we zoom in, the differences are relatively obvious.
# Density of each input, colored by customer, one facet per variable.
# Fix: x must be the measured `value`; the original mapped x to `name` (the
# variable label), so the curves did not show the data distribution at all.
# NOTE(review): the question above asks about the binary outcome, but
# df_trans_02 drops `outcome`, so this figure groups by customer -- confirm
# which grouping was intended.
df_trans_02 %>%
  ggplot(mapping = aes(x = value, color = customer)) +
  geom_density() +
  # Both axes are freed: with real values on x, each input has its own range.
  facet_wrap(~name, scales = "free") +
  theme_bw() +
  theme(axis.text.y = element_blank(), axis.text.x.bottom = element_blank())
# Box plots of each input split by customer, one facet per variable.
# Outlier points are shrunk to reduce clutter.
ggplot(data = df_trans_02, mapping = aes(x = as.factor(name), y = value)) +
  geom_boxplot(
    mapping = aes(fill = as.factor(customer), color = as.factor(customer)),
    outlier.size = 0.1,
    alpha = 0.35
  ) +
  facet_wrap(~name, scales = "free") +
  theme_bw() +
  theme(axis.text.x.bottom = element_blank(), axis.text.y = element_blank())
Q: Visualize the relationships between the continuous inputs, are they correlated?
A: Some of the inputs are highly correlated with each other.
# Pairwise correlation matrix of the input columns, drawn as the upper
# triangle only.
corrplot::corrplot(cor(df_con_all), type = "upper")
Q: Visualize the relationships between the continuous outputs (response and the log-transformed response) with respect to the continuous inputs. Can you identify any clear trends? Do the trends depend on the categorical inputs?
A: The output increases as some of the inputs increase, such as xa_01, xa_02 and xa_03, but this is not the case for all of them. As we can see, the categorical inputs do have an impact on the output prediction.
# Long format for trend plots: one row per (observation, input variable),
# carrying both the raw and log-transformed response plus the categorical
# inputs as identifier columns.
df_trans_04 <- df_all %>%
  mutate(log_response = log(response)) %>%
  select(starts_with("x"), log_response, response, customer, region) %>%
  pivot_longer(cols = -c(log_response, response, customer, region))
# Row count of the long table.
count(df_trans_04)
## # A tibble: 1 × 1
## n
## <int>
## 1 22341
# Loess trend of the log response against each input, one curve per region
# and one facet per input.
ggplot(
  data = df_trans_04,
  mapping = aes(x = value, y = log_response, color = region)
) +
  geom_smooth(method = "loess", formula = y ~ x, size = 0.4) +
  facet_wrap(~name, scales = "free") +
  scale_color_viridis_d(option = "plasma") +
  theme_bw() +
  theme(axis.text.x.bottom = element_blank(), axis.text.y = element_blank())
# Loess trend of the log response against each input, one curve per customer
# and one facet per input.
ggplot(
  data = df_trans_04,
  mapping = aes(x = value, y = log_response, color = customer)
) +
  geom_smooth(method = "loess", formula = y ~ x, size = 0.4) +
  facet_wrap(~name, scales = "free") +
  theme_bw() +
  theme(axis.text.x.bottom = element_blank(), axis.text.y = element_blank())
Q: How can you visualize the behavior of the binary outcome with respect to the continuous inputs?
A: As shown below, we can’t just draw a vertical line (a threshold on a single input) to divide the two outcome classes.
# Scatter of the binary outcome against each input, one facet per input.
# The former `mutate(log_response = log(response))` step was dead code: the
# select() below keeps only the x* columns and `outcome`, so the new column
# was discarded immediately. It has been removed.
df_all %>%
  select(starts_with("x"), outcome) %>%
  pivot_longer(!c(outcome)) %>%
  ggplot(mapping = aes(x = value, y = outcome)) +
  geom_point(mapping = aes(color = outcome), size = 0.1) +
  facet_wrap(~name, scales = "free") +
  theme_bw() +
  theme(axis.text.y = element_blank(), axis.text.x.bottom = element_blank())